This data set gives annual statistics by age group, race, ethnicity, and gender on the number of people who have benefited from homelessness assistance services.
The Homelessness Data Integration System (HDIS), an extensive data repository that compiles and examines information from each of California’s 44 Continuums of Care (CoCs), is the source of this information. Each Continuum of Care (CoC) collects and disseminates information on the people it serves via a range of programs, such as those aimed at ending homelessness, offering outreach services to people experiencing homelessness, assisting with permanent housing solutions, and other projects in line with California’s Housing First objectives.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the four raw demographic extracts, one data frame per breakdown
# (age group, ethnicity, gender, race).
age_demo <- read.csv("./original data/demo/age_demo.csv")
ethnicity_demo <- read.csv("./original data/demo/ethnicity_demo.csv")
gender_demo <- read.csv("./original data/demo/gender_demo.csv")
race_demo <- read.csv("./original data/demo/race_demo.csv")
# Preview the first six rows of the age extract.
head(age_demo, n = 6L)
## CALENDAR_YEAR LOCATION_ID LOCATION AGE_GROUP_PUBLIC
## 1 2017 All California 18-24
## 2 2017 All California 25-34
## 3 2017 All California 35-44
## 4 2017 All California 45-54
## 5 2017 All California 55-64
## 6 2017 All California 65+
## EXPERIENCING_HOMELESSNESS
## 1 15984
## 2 28654
## 3 25831
## 4 27651
## 5 23396
## 6 7111
# Preview the first six rows of the ethnicity extract.
head(ethnicity_demo, n = 6L)
## CALENDAR_YEAR LOCATION_ID LOCATION ETHNICITY
## 1 2017 All California Hispanic/Latinx
## 2 2017 All California Not Hispanic/Latinx
## 3 2017 All California Unknown
## 4 2017 CA-500 Santa Clara County CoC Hispanic/Latinx
## 5 2017 CA-500 Santa Clara County CoC Not Hispanic/Latinx
## 6 2017 CA-500 Santa Clara County CoC Unknown
## EXPERIENCING_HOMELESSNESS
## 1 60962
## 2 119153
## 3 3791
## 4 5034
## 5 5074
## 6 57
# Preview the first six rows of the gender extract.
head(gender_demo, n = 6L)
## CALENDAR_YEAR LOCATION_ID LOCATION GENDER
## 1 2017 All California Female
## 2 2017 All California Male
## 3 2017 All California Non-Singular Gender
## 4 2017 All California Questioning Gender
## 5 2017 All California Transgender
## 6 2017 All California Unknown
## EXPERIENCING_HOMELESSNESS
## 1 79670
## 2 101901
## 3 148
## 4 *
## 5 676
## 6 1505
# Preview the first six rows of the race extract.
head(race_demo, n = 6L)
## CALENDAR_YEAR LOCATION_ID LOCATION
## 1 2017 All California
## 2 2017 All California
## 3 2017 All California
## 4 2017 All California
## 5 2017 All California
## 6 2017 All California
## RACE EXPERIENCING_HOMELESSNESS
## 1 American Indian, Alaska Native, or Indigenous 5638
## 2 Asian or Asian American 3005
## 3 Black, African American, or African 57665
## 4 Multiple Races 9048
## 5 Native Hawaiian or Pacific Islander 2555
## 6 Unknown 6391
library(readr)
# Folder holding the raw demographic extracts.
folder_path <- "./original data/demo/"
# `pattern` is a regular expression, not a glob: escape the dot and anchor the
# match at the end of the name so that only files ending in ".csv" are listed
# (".csv" alone would also match e.g. "notes_csv.txt" or "x.csv.bak").
csv_files <- list.files(folder_path, pattern = "\\.csv$")
#' Clean one raw demographic CSV and write the cleaned copy to disk.
#'
#' Rows whose `EXPERIENCING_HOMELESSNESS` value is the placeholder "*" are
#' dropped, the `LOCATION_ID` column is removed, and column names are
#' normalised with `janitor::clean_names()`.
#'
#' @param input_file Path of the raw CSV to read.
#' @param output_file Path the cleaned CSV is written to; its parent folder is
#'   created when it does not exist yet.
#' @return The cleaned data frame, invisibly (previously the function returned
#'   the `NULL` produced by `write.csv()`).
clean_csv <- function(input_file, output_file) {
  data <- read.csv(input_file)

  data_cleaned <- data |>
    filter(EXPERIENCING_HOMELESSNESS != "*") |>
    select(-LOCATION_ID) |>
    janitor::clean_names()

  # A file that contained "*" is read with a character count column; once the
  # placeholder rows are gone, re-infer the type so the returned data matches
  # what read.csv() yields when the cleaned file is loaded again.
  counts <- data_cleaned$experiencing_homelessness
  if (is.character(counts)) {
    data_cleaned$experiencing_homelessness <- type.convert(counts, as.is = TRUE)
  }

  # write.csv() does not create missing folders, so make sure the target exists.
  dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)
  write.csv(data_cleaned, file = output_file, row.names = FALSE)

  invisible(data_cleaned)
}
# Destination folder for the cleaned copies.
output_folder <- "./processed data/demo/"
# Run every raw file through clean_csv(); each cleaned copy is written to
# `output_folder` under the original file name prefixed with "cleaned_".
cleaned_data_list <- lapply(csv_files, \(csv_name) {
  clean_csv(
    input_file = paste0(folder_path, csv_name),
    output_file = paste0(output_folder, "cleaned_", csv_name)
  )
})
# Read the cleaned files back in, one data frame per demographic breakdown.
age_clean <- read.csv("./processed data/demo/cleaned_age_demo.csv")
ethnicity_clean <- read.csv("./processed data/demo/cleaned_ethnicity_demo.csv")
gender_clean <- read.csv("./processed data/demo/cleaned_gender_demo.csv")
race_clean <- read.csv("./processed data/demo/cleaned_race_demo.csv")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Yearly totals per age group.
# The raw extracts carry a statewide "California" row next to individual CoC
# rows (visible in the ethnicity preview; presumably the same layout here), so
# summing over every location would double count. Keep only the statewide rows
# before aggregating.
age_plot <- age_clean |>
  filter(location == "California") |>
  select(-location) |>
  group_by(calendar_year, age_group_public) |>
  summarise(total_homelessness = sum(experiencing_homelessness))
## `summarise()` has grouped output by 'calendar_year'. You can override using the
## `.groups` argument.
# Line chart of the yearly totals, one trace per age group.
age_plot |>
  mutate(
    text_label = str_c(
      "Year: ", calendar_year, "\nAge Group: ", age_group_public
    )
  ) |>
  plot_ly(
    x = ~calendar_year,
    y = ~total_homelessness,
    type = "scatter",
    # "lines" is the valid scatter mode flag; "line" is not a recognised value.
    mode = "lines",
    color = ~age_group_public,
    colors = "viridis",
    text = ~text_label,
    alpha = 0.8
  ) |>
  layout(title = "Total Homelessness by Year and Age Group")
# Yearly totals per ethnicity.
# The data holds a statewide "California" row as well as one row per CoC
# (e.g. "Santa Clara County CoC"), so summing over every location would double
# count. Keep only the statewide rows before aggregating.
ethnicity_plot <- ethnicity_clean |>
  filter(location == "California") |>
  select(-location) |>
  group_by(calendar_year, ethnicity) |>
  summarise(total_homelessness = sum(experiencing_homelessness))
## `summarise()` has grouped output by 'calendar_year'. You can override using the
## `.groups` argument.
# Bar chart of the yearly totals, coloured by ethnicity.
ethnicity_plot |>
  mutate(
    text_label = str_c("Year: ", calendar_year, "\nEthnicity: ", ethnicity)
  ) |>
  plot_ly(
    x = ~calendar_year,
    y = ~total_homelessness,
    type = "bar",
    color = ~ethnicity,
    colors = "viridis",
    text = ~text_label,
    alpha = 0.8
  ) |>
  layout(title = "Total Homelessness by Year and ethnicity")
# Yearly totals per gender.
# The raw extracts carry a statewide "California" row next to individual CoC
# rows (visible in the ethnicity preview; presumably the same layout here), so
# summing over every location would double count. Keep only the statewide rows
# before aggregating.
gender_plot <- gender_clean |>
  filter(location == "California") |>
  select(-location) |>
  group_by(calendar_year, gender) |>
  summarise(total_homelessness = sum(experiencing_homelessness))
## `summarise()` has grouped output by 'calendar_year'. You can override using the
## `.groups` argument.
# Bar chart of the yearly totals, coloured by gender.
gender_plot |>
  mutate(
    text_label = str_c("Year: ", calendar_year, "\nGender: ", gender)
  ) |>
  plot_ly(
    x = ~calendar_year,
    y = ~total_homelessness,
    type = "bar",
    color = ~gender,
    colors = "viridis",
    text = ~text_label,
    alpha = 0.8
  ) |>
  layout(title = "Total Homelessness by Year and gender")
# Yearly totals per race.
# The raw extracts carry a statewide "California" row next to individual CoC
# rows (visible in the ethnicity preview; presumably the same layout here), so
# summing over every location would double count. Keep only the statewide rows
# before aggregating.
race_plot <- race_clean |>
  filter(location == "California") |>
  select(-location) |>
  group_by(calendar_year, race) |>
  summarise(total_homelessness = sum(experiencing_homelessness))
## `summarise()` has grouped output by 'calendar_year'. You can override using the
## `.groups` argument.
# Line chart of the yearly totals, one trace per race.
race_plot |>
  mutate(text_label = str_c("Year: ", calendar_year, "\nRace: ", race)) |>
  plot_ly(
    x = ~calendar_year,
    y = ~total_homelessness,
    type = "scatter",
    # "lines" is the valid scatter mode flag; "line" is not a recognised value.
    mode = "lines",
    color = ~race,
    colors = "viridis",
    text = ~text_label,
    alpha = 0.8
  ) |>
  layout(
    # The title previously said "gender", copied from the gender chart.
    title = "Total Homelessness by Year and Race",
    # Legend placed at the right edge, vertically centred.
    legend = list(
      x = 1,
      y = 0.5,
      traceorder = "normal",
      bgcolor = "white",
      bordercolor = "white",
      borderwidth = 0.5
    )
  )
# Donut chart of each race's share of the total, pooled over all years.
# Only the statewide "California" rows are used: the raw extracts also list
# individual CoC rows (visible in the ethnicity preview; presumably the same
# layout here), and mixing both levels would count the same people twice.
race_clean |>
  filter(location == "California") |>
  group_by(race) |>
  summarise(total_homelessness = sum(experiencing_homelessness)) |>
  mutate(percentage = total_homelessness / sum(total_homelessness)) |>
  plot_ly(labels = ~race, values = ~percentage, type = "pie", hole = 0.4) |>
  layout(title = "Percentage of Total Homeless by Race")